suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(tidytext))Drive BC Network Analysis
Data Loading and Initial Cleaning
data_files <- list.files(path = "../data/",
pattern = "drivebceventshist.*\\.csv",
full.names = TRUE)
data_files <- data_files[grep("(2018|2019|2020|2021|2022|2023)", data_files)]
# Define column types
col_types <- cols(
EVENT_TYPE = col_character(),
EVENT_SUBTYPE = col_character(),
SEVERITY = col_character(),
CREATED = col_character(),
AREA_NAME = col_character(),
HEAD_LATITUDE = col_double(),
HEAD_LONGITUDE = col_double(),
ROAD_NAME = col_character()
)
# Read and combine with consistent types
road_data <- bind_rows(
lapply(data_files, function(file) {
read_csv(file, col_types = col_types) %>%
select(
EVENT_TYPE,
EVENT_SUBTYPE,
SEVERITY,
CREATED,
AREA_NAME,
HEAD_LATITUDE,
HEAD_LONGITUDE,
ROAD_NAME
) %>%
mutate(CREATED = as.POSIXct(CREATED))
})
)head(road_data)nrow(road_data)[1] 1005385
Bipartite network for incidents and time factors
create_incident_time_network <- function(data_sample) {
incident_time_edges <- data_sample %>%
mutate(
hour_created = hour(as.POSIXct(CREATED)),
time_of_day = case_when(
hour_created >= 6 & hour_created < 12 ~ "Morning",
hour_created >= 12 & hour_created < 18 ~ "Afternoon",
hour_created >= 18 & hour_created < 24 ~ "Evening",
hour_created >= 0 & hour_created < 6 ~ "Night"
),
day_of_week = weekdays(CREATED),
season = case_when(
month(CREATED) %in% c(12, 1, 2) ~ "Winter",
month(CREATED) %in% c(3, 4, 5) ~ "Spring",
month(CREATED) %in% c(6, 7, 8) ~ "Summer",
TRUE ~ "Fall"
)
) %>%
select(EVENT_TYPE, SEVERITY, AREA_NAME, time_of_day, day_of_week, season) %>%
pivot_longer(cols = c(time_of_day, day_of_week, season), names_to = "time_factor", values_to = "time_value") %>%
select(EVENT_TYPE, time_value)
# Rest of the function remains the same
incident_time_nodes <- data.frame(
name = unique(c(incident_time_edges$EVENT_TYPE, incident_time_edges$time_value)),
type = c(rep(TRUE, length(unique(incident_time_edges$EVENT_TYPE))),
rep(FALSE, length(unique(incident_time_edges$time_value))))
)
incident_time_graph <- graph_from_data_frame(d = incident_time_edges,
vertices = incident_time_nodes,
directed = FALSE)
return(incident_time_graph)
}
# Stratified sampling
set.seed(123) # For reproducibility
road_data_sample <- road_data %>%
group_by(EVENT_TYPE) %>%
sample_frac(0.001) %>% # .001 is about 1000 sample size for our data
ungroup()
incident_time_network <- create_incident_time_network(road_data_sample)
# Determine the layout
layout <- layout_as_bipartite(incident_time_network)
plot(
incident_time_network,
layout = layout,
vertex.label.cex = 0.6,
vertex.size = 5,
vertex.label.dist = 1.5
)The sample_frac function is used to take a stratified sample from the road_data dataframe, ensuring that each EVENT_TYPE is proportionally represented.
in_degrees <- degree(incident_time_network, mode = "in")
degree_data <- data.frame(
node = V(incident_time_network)$name,
in_degree = in_degrees
)
# Filter to include only time values
time_values <- degree_data %>%
filter(grepl("^[A-Z][a-z]", node))
time_values <- time_values %>%
arrange(desc(in_degree))
ggplot(time_values, aes(x = reorder(node, in_degree), y = in_degree)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "All Event occurrences time distribution", x = "Node", y = "In-Degree") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() Create a plot for each event type, using the entire dataset 2018-2023
event_types <- unique(road_data$EVENT_TYPE)
years <- 2018:2023
# Combined data frame for all event types and years
combined_data <- do.call(rbind, lapply(event_types, function(event_type) {
do.call(rbind, lapply(years, function(year) {
event_data <- road_data %>%
filter(EVENT_TYPE == event_type, year(CREATED) == year)
event_network <- create_incident_time_network(event_data)
event_degree_data <- data.frame(
node = V(event_network)$name,
in_degree = degree(event_network, mode = "in"),
event_type = event_type,
year = year
)
event_degree_data %>%
filter(grepl("^[A-Z][a-z]", node))
}))
}))
combined_data <- combined_data %>%
group_by(event_type, node) %>%
mutate(node_total = sum(in_degree)) %>%
ungroup() %>%
group_by(event_type) %>%
mutate(total_degree = sum(in_degree)) %>%
ungroup()
ggplot(combined_data,
aes(x = reorder_within(node, desc(node_total), event_type),
y = in_degree,
fill = factor(year))) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "In-Degree Distribution by Event Type and Year",
x = "Node",
y = "In-Degree",
fill = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_wrap(~ reorder(event_type, desc(total_degree)),
scales = "free_x",
nrow = 2,
ncol = 3) +
scale_x_reordered()Region event distribution plot
# region event distribution plot
region_events <- road_data %>%
group_by(AREA_NAME, EVENT_TYPE) %>%
summarise(count = n(), .groups = 'drop') %>%
group_by(AREA_NAME) %>%
mutate(total_events = sum(count)) %>%
ungroup()
ggplot(region_events,
aes(x = count,
y = reorder(AREA_NAME, total_events),
fill = EVENT_TYPE)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Event Distribution by Region (2018-2023)",
x = "Number of Events",
y = "Region",
fill = "Event Type") +
theme_minimal() +
theme(
axis.text.y = element_text(size = 8),
legend.position = "right",
plot.title = element_text(hjust = 0.5)
)
region_eventsRelationship between maintenance/construction and incidents
Data Preparation:
- Considered highways only
- Categorized events into “maintenance” (CONSTRUCTION/MAINTENANCE), “incident” (INCIDENT/COLLISION), or “other”
- Counted events by road and category
- Transformed data into wide format with maintenance and incident counts per road
Network Creation:
Created an adjacency matrix using distance between roads based on their maintenance/incident patterns Converted to an igraph network where:
- Nodes represent roads
- Edges connect roads with similar incident patterns (The code creates a distance-based network where roads are connected if they have similar numbers of maintenance events and incidents, using Euclidean distance to measure this similarity.)
- Edge weight determined by similarity threshold (mean distance)
road_network <- road_data %>%
filter(grepl("^(Hwy|Highway|Trans-Canada|BC-)", ROAD_NAME)) %>%
mutate(
event_category = case_when(
EVENT_TYPE %in% c("CONSTRUCTION", "MAINTENANCE") ~ "maintenance",
EVENT_TYPE %in% c("INCIDENT", "COLLISION") ~ "incident",
TRUE ~ "other"
)
) %>%
# Count events by road and category
group_by(ROAD_NAME, event_category) %>%
summarise(
count = n(),
.groups = 'drop'
) %>%
# Pivot to wide format
pivot_wider(
names_from = event_category,
values_from = count,
values_fill = 0
)
# Create adjacency matrix based on similar incident patterns
road_matrix <- road_network %>%
select(maintenance, incident) %>%
dist() %>%
as.matrix()
g <- graph_from_adjacency_matrix(
(road_matrix < mean(road_matrix)) * 1,
mode = "undirected",
weighted = TRUE
)
g <- simplify(g, remove.loops = TRUE)
# Node attributes
V(g)$name <- road_network$ROAD_NAME
V(g)$maintenance <- road_network$maintenance
V(g)$incidents <- road_network$incident
V(g)$ratio <- road_network$incident / (road_network$maintenance + 1)
correlation <- cor.test(road_network$maintenance, road_network$incident)
layout <- layout_on_grid(g)
incident_sizes <- log1p(V(g)$incidents)
norm_sizes <- 3 + (incident_sizes - min(incident_sizes)) /
(max(incident_sizes) - min(incident_sizes)) * 15
plot(g,
layout = layout,
vertex.size = norm_sizes,
vertex.color = ifelse(V(g)$ratio > median(V(g)$ratio),
adjustcolor("red", alpha=0.7),
adjustcolor("green", alpha=0.7)),
vertex.label.cex = 0.7,
vertex.label.dist = 0.8,
vertex.label.color = "black",
vertex.frame.color = "gray50",
edge.color = adjustcolor("gray40", alpha=0.2),
edge.width = 0.5,
main = paste("Road Network: Maintenance vs Incidents\n",
"Correlation =", round(correlation$estimate, 3)))
legend("bottomright",
legend=c("High incident/maintenance ratio", "Low incident/maintenance ratio"),
fill=c(adjustcolor("red", alpha=0.7), adjustcolor("green", alpha=0.7)),
cex=0.6,
bty="n")
# Print statistical summary
summary_stats <- road_network %>%
summarise(
correlation = cor(maintenance, incident),
roads_with_high_maintenance = sum(maintenance > mean(maintenance)),
roads_with_high_incidents = sum(incident > mean(incident))
)
print(summary_stats)# A tibble: 1 × 3
correlation roads_with_high_maintenance roads_with_high_incidents
<dbl> <int> <int>
1 0.948 15 14
# Ordered list of roads and weights
road_weights <- data.frame(
road = V(g)$name,
incidents = V(g)$incidents,
maintenance = V(g)$maintenance,
ratio = V(g)$ratio
) %>%
arrange(desc(ratio)) %>%
mutate(
formatted_output = sprintf(
"%s: %.2f incidents per maintenance (incidents: %d, maintenance: %d)",
road,
ratio,
incidents,
maintenance
)
)
cat("\nRoads ordered by incident/maintenance ratio:\n")
Roads ordered by incident/maintenance ratio:
cat(paste0(seq_along(road_weights$formatted_output), ". ",
road_weights$formatted_output,
collapse = "\n"))1. Highway 77: 29.83 incidents per maintenance (incidents: 179, maintenance: 5)
2. Highway 103: 14.89 incidents per maintenance (incidents: 268, maintenance: 17)
3. Highway 51: 8.88 incidents per maintenance (incidents: 213, maintenance: 23)
4. Highway 97D: 7.45 incidents per maintenance (incidents: 447, maintenance: 59)
5. Highway 52E: 7.00 incidents per maintenance (incidents: 7, maintenance: 0)
6. Highway 8: 6.06 incidents per maintenance (incidents: 1607, maintenance: 264)
7. Highway 31A: 5.38 incidents per maintenance (incidents: 210, maintenance: 38)
8. Highway 40: 5.00 incidents per maintenance (incidents: 15, maintenance: 2)
9. Highway 7B: 4.51 incidents per maintenance (incidents: 659, maintenance: 145)
10. Highway 41: 4.38 incidents per maintenance (incidents: 35, maintenance: 7)
11. Highway 91A: 4.36 incidents per maintenance (incidents: 659, maintenance: 150)
12. Highway 5A: 4.16 incidents per maintenance (incidents: 2427, maintenance: 582)
13. Highway 113: 4.00 incidents per maintenance (incidents: 4, maintenance: 0)
14. Highway 91: 3.91 incidents per maintenance (incidents: 4708, maintenance: 1204)
15. Highway 101: 2.70 incidents per maintenance (incidents: 426, maintenance: 157)
16. Highway 26: 2.23 incidents per maintenance (incidents: 820, maintenance: 367)
17. Highway 12: 2.02 incidents per maintenance (incidents: 736, maintenance: 363)
18. Highway 7A: 2.00 incidents per maintenance (incidents: 2, maintenance: 0)
19. Highway 22A: 1.91 incidents per maintenance (incidents: 42, maintenance: 21)
20. Highway 13: 1.72 incidents per maintenance (incidents: 136, maintenance: 78)
21. Highway 31: 1.69 incidents per maintenance (incidents: 446, maintenance: 263)
22. Highway 6 EW: 1.68 incidents per maintenance (incidents: 1906, maintenance: 1135)
23. Highway 17A: 1.62 incidents per maintenance (incidents: 263, maintenance: 161)
24. Highway 6 NS: 1.52 incidents per maintenance (incidents: 448, maintenance: 293)
25. Highway 3A: 1.52 incidents per maintenance (incidents: 2147, maintenance: 1416)
26. Highway 23: 1.48 incidents per maintenance (incidents: 2616, maintenance: 1764)
27. Highway 99: 1.47 incidents per maintenance (incidents: 10583, maintenance: 7202)
28. Highway 49: 1.45 incidents per maintenance (incidents: 530, maintenance: 364)
29. Highway 11: 1.42 incidents per maintenance (incidents: 664, maintenance: 468)
30. Highway 1: 1.41 incidents per maintenance (incidents: 39207, maintenance: 27824)
31. Highway 17 SFPR: 1.29 incidents per maintenance (incidents: 89, maintenance: 68)
32. Highway 97A: 1.27 incidents per maintenance (incidents: 1251, maintenance: 984)
33. Highway 6: 1.20 incidents per maintenance (incidents: 12, maintenance: 9)
34. Highway 93: 1.19 incidents per maintenance (incidents: 1096, maintenance: 923)
35. Highway 18: 1.17 incidents per maintenance (incidents: 132, maintenance: 112)
36. Highway 5: 1.07 incidents per maintenance (incidents: 6887, maintenance: 6412)
37. Highway 395: 1.07 incidents per maintenance (incidents: 30, maintenance: 27)
38. Highway 59: 1.07 incidents per maintenance (incidents: 15, maintenance: 13)
39. Highway 97C: 1.04 incidents per maintenance (incidents: 1197, maintenance: 1149)
40. Highway 4A: 1.03 incidents per maintenance (incidents: 72, maintenance: 69)
41. Highway 22: 1.02 incidents per maintenance (incidents: 126, maintenance: 122)
42. Highway 2: 1.02 incidents per maintenance (incidents: 583, maintenance: 573)
43. Highway 3: 0.91 incidents per maintenance (incidents: 6662, maintenance: 7305)
44. Highway 97: 0.89 incidents per maintenance (incidents: 13606, maintenance: 15373)
45. Highway 33: 0.88 incidents per maintenance (incidents: 592, maintenance: 670)
46. Highway 7: 0.85 incidents per maintenance (incidents: 2145, maintenance: 2522)
47. Highway 17: 0.83 incidents per maintenance (incidents: 2085, maintenance: 2525)
48. Highway 15: 0.76 incidents per maintenance (incidents: 606, maintenance: 795)
49. Highway 37A: 0.75 incidents per maintenance (incidents: 810, maintenance: 1081)
50. Highway 20: 0.65 incidents per maintenance (incidents: 1060, maintenance: 1628)
51. Highway 10: 0.63 incidents per maintenance (incidents: 545, maintenance: 865)
52. Highway 14: 0.62 incidents per maintenance (incidents: 1086, maintenance: 1758)
53. Highway 37: 0.60 incidents per maintenance (incidents: 1957, maintenance: 3282)
54. Highway 29: 0.59 incidents per maintenance (incidents: 847, maintenance: 1430)
55. Highway 24: 0.58 incidents per maintenance (incidents: 316, maintenance: 543)
56. Highway 1A: 0.57 incidents per maintenance (incidents: 124, maintenance: 218)
57. Highway 30: 0.56 incidents per maintenance (incidents: 132, maintenance: 235)
58. Highway 3B: 0.54 incidents per maintenance (incidents: 90, maintenance: 165)
59. Highway 95: 0.47 incidents per maintenance (incidents: 485, maintenance: 1039)
60. Highway 43: 0.47 incidents per maintenance (incidents: 69, maintenance: 147)
61. Highway 4: 0.45 incidents per maintenance (incidents: 900, maintenance: 2015)
62. Highway 52 E: 0.42 incidents per maintenance (incidents: 150, maintenance: 357)
63. Highway 28: 0.42 incidents per maintenance (incidents: 333, maintenance: 800)
64. Highway 16: 0.38 incidents per maintenance (incidents: 4110, maintenance: 10808)
65. Highway 39: 0.38 incidents per maintenance (incidents: 45, maintenance: 119)
66. Highway 97B: 0.36 incidents per maintenance (incidents: 82, maintenance: 229)
67. Highway 52 N: 0.35 incidents per maintenance (incidents: 87, maintenance: 245)
68. Highway 19: 0.34 incidents per maintenance (incidents: 1185, maintenance: 3532)
69. Highway 21: 0.33 incidents per maintenance (incidents: 45, maintenance: 137)
70. Highway 19A: 0.32 incidents per maintenance (incidents: 494, maintenance: 1559)
71. Highway 95A: 0.23 incidents per maintenance (incidents: 90, maintenance: 394)
72. Highway 62: 0.20 incidents per maintenance (incidents: 7, maintenance: 34)
73. Highway 9: 0.17 incidents per maintenance (incidents: 104, maintenance: 623)
74. Highway 27: 0.15 incidents per maintenance (incidents: 69, maintenance: 458)
75. Highway 35: 0.06 incidents per maintenance (incidents: 26, maintenance: 413)
76. Highway 118: 0.06 incidents per maintenance (incidents: 16, maintenance: 265)
77. Highway 99A: 0.04 incidents per maintenance (incidents: 2, maintenance: 55)
The high correlation suggests that maintenance work is reactive - roads with more incidents tend to receive more maintenance, rather than maintenance preventing incidents.
- Problem roads get more maintenance attention
- High-traffic roads naturally have both more incidents and require more maintenance
- Maintenance work itself may temporarily increase the likelihood of incidents
communities <- cluster_louvain(g)
V(g)$community <- membership(communities)
n_communities <- length(unique(V(g)$community))
community_colors <- rainbow(n_communities, alpha=0.6)
par(mfrow=c(1,2), mar=c(2,2,3,2))
layout_grid <- layout_on_grid(g)
plot(g,
layout = layout_grid,
vertex.size = norm_sizes,
vertex.color = community_colors[V(g)$community],
vertex.label.cex = 0.6,
vertex.label.dist = 1.2,
vertex.label.color = "black",
vertex.frame.color = "gray50",
edge.color = adjustcolor("gray40", alpha=0.2),
edge.width = 0.5,
main = "Grid Layout\nBC Highway Communities")
layout_fr <- layout_with_fr(g, niter=500, grid="nogrid")
plot(g,
layout = layout_fr,
vertex.size = norm_sizes,
vertex.color = community_colors[V(g)$community],
vertex.label.cex = 0.6,
vertex.label.dist = 1.2,
vertex.label.color = "black",
vertex.frame.color = "gray50",
edge.color = adjustcolor("gray40", alpha=0.2),
edge.width = 0.5,
main = "Fruchterman-Reingold Layout\nBC Highway Communities")
par(mfrow=c(1,1), mar=c(5,4,4,2)+0.1)
community_sizes <- table(membership(communities))
cluster_stats <- data.frame(
community = as.numeric(names(community_sizes)),
cluster_size = as.numeric(community_sizes),
avg_incidents = tapply(V(g)$incidents, V(g)$community, mean),
avg_maintenance = tapply(V(g)$maintenance, V(g)$community, mean)
)
cluster_stats <- cluster_stats[order(-cluster_stats$cluster_size), ]
cat("\nCluster Summary:\n")
Cluster Summary:
for(i in 1:nrow(cluster_stats)) {
community_id <- cluster_stats$community[i]
cat(sprintf("\nCluster %d (size: %d):\n",
community_id,
cluster_stats$cluster_size[i]))
cat("Members:", paste(V(g)$name[V(g)$community == community_id], collapse=", "), "\n")
cat(sprintf("Average incidents: %.1f\n", cluster_stats$avg_incidents[i]))
cat(sprintf("Average maintenance: %.1f\n", cluster_stats$avg_maintenance[i]))
}
Cluster 2 (size: 37):
Members: Highway 10, Highway 101, Highway 11, Highway 118, Highway 12, Highway 14, Highway 15, Highway 17A, Highway 19, Highway 19A, Highway 1A, Highway 2, Highway 20, Highway 24, Highway 26, Highway 27, Highway 28, Highway 29, Highway 30, Highway 31, Highway 33, Highway 35, Highway 37, Highway 37A, Highway 3B, Highway 4, Highway 49, Highway 52 E, Highway 52 N, Highway 6 NS, Highway 7B, Highway 9, Highway 91A, Highway 95, Highway 95A, Highway 97B, Highway 97D
Average incidents: 509.9
Average maintenance: 755.7
Cluster 3 (size: 23):
Members: Highway 103, Highway 113, Highway 13, Highway 17 SFPR, Highway 18, Highway 21, Highway 22, Highway 22A, Highway 31A, Highway 39, Highway 395, Highway 40, Highway 41, Highway 43, Highway 4A, Highway 51, Highway 52E, Highway 59, Highway 6, Highway 62, Highway 77, Highway 7A, Highway 99A
Average incidents: 76.3
Average maintenance: 48.0
Cluster 5 (size: 11):
Members: Highway 17, Highway 23, Highway 3A, Highway 5A, Highway 6 EW, Highway 7, Highway 8, Highway 91, Highway 93, Highway 97A, Highway 97C
Average incidents: 2107.7
Average maintenance: 1315.3
Cluster 6 (size: 2):
Members: Highway 3, Highway 5
Average incidents: 6774.5
Average maintenance: 6858.5
Cluster 1 (size: 1):
Members: Highway 1
Average incidents: 39207.0
Average maintenance: 27824.0
Cluster 4 (size: 1):
Members: Highway 16
Average incidents: 4110.0
Average maintenance: 10808.0
Cluster 7 (size: 1):
Members: Highway 97
Average incidents: 13606.0
Average maintenance: 15373.0
Cluster 8 (size: 1):
Members: Highway 99
Average incidents: 10583.0
Average maintenance: 7202.0